{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import jieba\n",
    "import Spider\n",
    "\n",
    "def highlight(item, query: str, side_len: int = 12) -> str:\n",
    "    positions = list()\n",
    "    query_words = list(jieba.cut(query))  # 把生成器强制转换为列表\n",
    "    i = 0\n",
    "    content_lower = item[2].lower()\n",
    "    word_start_map = list()\n",
    "    word_end_map = list()\n",
    "    last_word_end = -1\n",
    "    len_content_lower = len(content_lower)\n",
    "    segments = list()\n",
    "    for keyword in query_words:\n",
    "        idx = content_lower.find(keyword.lower())\n",
    "        positions.append(idx)\n",
    "    for keyword in jieba.cut(content_lower):\n",
    "        # 用于实现提取摘要时“整词切分”，避免出现截取摘要时首尾的词被截断\n",
    "        current_word_start = last_word_end + 1\n",
    "        current_word_end = current_word_start + len(keyword) - 1\n",
    "        for _ in range(current_word_start, current_word_end+1):\n",
    "            word_start_map.append(current_word_start)\n",
    "            word_end_map.append(current_word_end)\n",
    "        last_word_end = current_word_end\n",
    "    positions.sort()\n",
    "    while i < len(positions):\n",
    "        start_pos = max(positions[i] - side_len, 0)\n",
    "        end_pos = min(positions[i] + side_len, len_content_lower-1)\n",
    "        # 用于实现合并相邻且有部分重合的摘要\n",
    "        while (i < len(positions) - 1) and (positions[i+1] - positions[i] < side_len*2):\n",
    "            end_pos = min(positions[i+1] + side_len, len_content_lower-1)\n",
    "            i += 1\n",
    "        start_ellipsis = '...' if start_pos > 0 else ''\n",
    "        end_ellipsis = '...' if end_pos < len_content_lower else ''\n",
    "        segments.append(start_ellipsis + item[2][word_start_map[start_pos]: word_end_map[end_pos]] + end_ellipsis)\n",
    "        i += 1\n",
    "    result = text = item[1] + '\\n' + ''.join(segments)\n",
    "    text_lower = text.lower()\n",
    "    for keyword in query_words:\n",
    "        # 高亮部分\n",
    "        idx = text_lower.find(keyword.lower())\n",
    "        if idx >= 0:\n",
    "            ori_word = text[idx:idx+(len(keyword))]\n",
    "            result = result.replace(ori_word, f'<span style=\"color:red\";>{ori_word}</span>')\n",
    "    return result\n",
    "\n",
    "\n",
    "class MySearcherC11V0:\n",
    "    \"\"\"\n",
    "    第十次课升级的搜索类版本：\n",
    "    用文档频率(DF)对词进行加权\n",
    "    \"\"\"\n",
    "    def __init__(self):\n",
    "        self.docs = list()  # 所有文档原始数据\n",
    "        self.load_data()\n",
    "        self.cache = dict()\n",
    "        self.vocab = set()  # 索引词表\n",
    "        self.lower_preprocess()\n",
    "        jieba.load_userdict('./dict.txt')\n",
    "        self.df = dict()\n",
    "        self.build_cache()\n",
    "\n",
    "    def load_data(self, data_file_name='./news_list.pkl'):\n",
    "        if os.path.exists(data_file_name):\n",
    "            self.docs = Spider.pickle_load(data_file_name)\n",
    "        else:\n",
    "            Spider.pickle_save(data_file_name)\n",
    "            self.docs = Spider.pickle_load(data_file_name)\n",
    "\n",
    "    def search(self, query):\n",
    "        result = None\n",
    "        for keyword in jieba.cut(query.lower()):\n",
    "            if keyword in self.cache:\n",
    "                if result is None:\n",
    "                    result = self.cache[keyword]\n",
    "                else:\n",
    "                    result = result & self.cache[keyword]\n",
    "            else:\n",
    "                result = set()\n",
    "                break\n",
    "        if result is None:\n",
    "            result = set()\n",
    "        sorted_result = self.rank(query, result)\n",
    "        return sorted_result\n",
    "\n",
    "    def rank(self, query, result_set):\n",
    "        result = list()\n",
    "        for doc_id in result_set:\n",
    "            result.append([doc_id, self.score(self.docs[doc_id], query)])\n",
    "        result.sort(key=lambda x: x[1], reverse=True)\n",
    "        return result\n",
    "\n",
    "    def render_search_result(self, query):\n",
    "        \"\"\"\n",
    "        返回带有高亮和摘要的查询结果\n",
    "        \"\"\"\n",
    "        count = 0\n",
    "        result = ''\n",
    "        for item in self.search(query):\n",
    "            count += 1\n",
    "            result += f'{count}[{item[1]}] {highlight(self.docs[item[0]], query)}\\n'\n",
    "        return result\n",
    "\n",
    "    def score(self, item, query):\n",
    "        score = 0\n",
    "        for keyword in jieba.cut(query.lower()):\n",
    "            title_score = item[1].lower().count(keyword.lower())\n",
    "            content_score = item[2].lower().count(keyword.lower())\n",
    "            score += (title_score * 5 + content_score * 3) / len(item[2]) / self.df[keyword]\n",
    "        return score\n",
    "\n",
    "    def build_cache(self):\n",
    "        \"\"\"\n",
    "        用分词（用文档过滤词库）的方式构建索引\n",
    "        \"\"\"\n",
    "        doc_id = 0\n",
    "        for doc in self.docs:\n",
    "            doc_word_set = set()\n",
    "            for word in jieba.cut_for_search(doc[3]):\n",
    "                if word not in doc_word_set:\n",
    "                    result_item = doc_id\n",
    "                    if word not in self.cache:\n",
    "                        self.cache[word] = {result_item}\n",
    "                    else:\n",
    "                        self.cache[word].add(result_item)\n",
    "                    self.vocab.add(word)\n",
    "                    doc_word_set.add(word)\n",
    "                    if word in self.df:\n",
    "                        self.df[word] += 1\n",
    "                    else:\n",
    "                        self.df[word] = 1\n",
    "            doc_id += 1\n",
    "\n",
    "    def lower_preprocess(self):\n",
    "        for doc_id in range(len(self.docs)):\n",
    "            self.docs[doc_id].append(\n",
    "                (self.docs[doc_id][1] + ' ' + self.docs[doc_id][2]).lower())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "outputs": [],
   "source": [
    "from math import log10\n",
    "class MySearcherC11V1(MySearcherC11V0):\n",
    "    \"\"\"\n",
    "    Softens the influence of document-frequency and document-length weighting.\n",
    "    Improves the IDF weight.\n",
    "    \"\"\"\n",
    "    def score(self, item, query):\n",
    "        \"\"\"\n",
    "        Score one document against the query as the sum of tf * idf per word.\n",
    "\n",
    "        tf  = (2 * title hits + body hits) / log10(body length)\n",
    "        idf = log10(N / log10(df + 0.01)), N being the number of documents\n",
    "\n",
    "        Words that are not in the index contribute nothing.\n",
    "        \"\"\"\n",
    "        title_lower = item[1].lower()\n",
    "        content_lower = item[2].lower()\n",
    "        # log10 is 0 for a one-character body and undefined for an empty one;\n",
    "        # clamping the length to 2 leaves every longer document's score unchanged.\n",
    "        length_norm = log10(max(len(item[2]), 2))\n",
    "        doc_count = len(self.docs)\n",
    "        score = 0\n",
    "        # The query is lowercased before segmentation, so keywords are lowercase.\n",
    "        for keyword in jieba.cut(query.lower()):\n",
    "            doc_freq = self.df.get(keyword)\n",
    "            if not doc_freq:\n",
    "                continue\n",
    "            title_score = title_lower.count(keyword)\n",
    "            content_score = content_lower.count(keyword)\n",
    "            tf = (title_score * 2 + content_score * 1) / length_norm\n",
    "            # The +0.01 keeps the inner log above zero when df == 1.\n",
    "            idf = log10(doc_count / log10(doc_freq + 0.01))\n",
    "            score += tf * idf\n",
    "        return score"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 2.49 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Instantiate the V1 searcher. The inherited constructor loads the documents,\n",
    "# appends the lowercased text to each record and builds the index and DF table;\n",
    "# the cell magic above reports how long that takes.\n",
    "searcher_v1 = MySearcherC11V1()"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1[45.33659621431827] 传<span style=\"color:red\";>华为</span>预计今年智能手机出货量同比减少60%，降至7000万部\n",
      "...，据供应链消息人士透露，<span style=\"color:red\";>华为</span>已通知其供应商，预计......，因为美国政府禁止其进口<span style=\"color:red\";>5G</span>机型的零部件。许多供应...\n",
      "2[27.96582616358055] <span style=\"color:red\";>华为</span>将在英国起诉汇丰，要拿到孟晚舟案关键文件\n",
      "...消息，当地时间12日，中国<span style=\"color:red\";>华为</span>公司首席财务官孟晚舟......，也有关于<span style=\"color:red\";>华为</span>在欧洲拓展<span style=\"color:red\";>5G</span>业务不是很好的消息。一...\n",
      "3[25.321727082223212] <span style=\"color:red\";>华为</span>推\"智慧养猪\"，任正非曾称如果养猪可能也是状元\n",
      "...生猪养殖业真香，连电信巨头<span style=\"color:red\";>华为</span>都想进去分一杯羹了。近......发展论坛上，<span style=\"color:red\";>华为</span>就发表了《<span style=\"color:red\";>5G</span>引领现代猪场AI使能智...\n",
      "4[24.167628831967857] <span style=\"color:red\";>华为</span>推“智慧养猪”，任正非：<span style=\"color:red\";>华为</span>不靠手机也能活\n",
      "...近日，任正非首次公开提及<span style=\"color:red\";>华为</span>“南泥湾”计划，即生产自......发展论坛上，<span style=\"color:red\";>华为</span>就发表了《<span style=\"color:red\";>5G</span>引领现代AI使能智慧养...\n",
      "5[19.654983631278615] <span style=\"color:red\";>华为</span>折叠屏手机<span style=\"color:red\";>华为</span>MateX2将于下周一20时发布\n",
      "财联社2月18日讯，据<span style=\"color:red\";>华为</span>官微，<span style=\"color:red\";>华为</span>折叠屏手机......出售终端业务。我们可以转让<span style=\"color:red\";>5G</span>技术，但绝不会出售终...\n",
      "6[18.317293331043253] 除了欢迎拜登致电<span style=\"color:red\";>华为</span>，任正非还谈了孟晚舟、退休时间、<span style=\"color:red\";>5G</span>转让等\n",
      "...任正非接受中外媒体采访，就<span style=\"color:red\";>华为</span>发展、产业、个人生活......，也有关于<span style=\"color:red\";>华为</span>在欧洲拓展<span style=\"color:red\";>5G</span>业务不是很好的消息。一...\n",
      "7[17.880048852263627] <span style=\"color:red\";>华为</span>供应链公司：已向<span style=\"color:red\";>华为</span>P50手机供货，供货时间有延后\n",
      "据<span style=\"color:red\";>华为</span>手机供应链公司，该公......，因为美国政府禁止其进口<span style=\"color:red\";>5G</span>机型的零部件。许多供应...\n",
      "8[13.458084783581613] 争国内第一！荣耀想打败<span style=\"color:red\";>华为</span>小米，靠3599的V40行么？\n",
      "...月前，荣耀正式宣布独立。<span style=\"color:red\";>华为</span>官方称这是“产业链的一......透露：“正在着手荣耀的2个<span style=\"color:red\";>5G</span>项目。”但是，媒体对...\n",
      "9[11.419042782242016] 任正非下个梦想：让煤矿工人可以穿西装打领带上班\n",
      "...9日，由山西省人民政府、<span style=\"color:red\";>华为</span>技术有限公司、晋能控股集......矿业智能化发展探索方向。<span style=\"color:red\";>5G</span>应用上，世界上多数信...\n",
      "10[3.2380682655740025] 小米国内机型不再支持自行安装GMS框架，国际版不受影响\n",
      "...年5月16日后，美国制裁<span style=\"color:red\";>华为</span>，谷歌的GMS不再为华......纪念版、Redmi 10X <span style=\"color:red\";>5G</span>等。1月31日晚间，小...\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Query the V1 searcher and print the ranked results with highlighted summaries.\n",
    "print(searcher_v1.render_search_result('华为5G'))"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "outputs": [],
   "source": [
    "class MySearcherC11V2(MySearcherC11V0):\n",
    "    \"\"\"\n",
    "    BM25 scoring algorithm.\n",
    "    \"\"\"\n",
    "    def __init__(self):\n",
    "        # Must exist before the parent constructor runs build_cache().\n",
    "        self.avg_dl = 0  # average length (characters) of the indexed text\n",
    "        super().__init__()\n",
    "\n",
    "    def build_cache(self):\n",
    "        \"\"\"\n",
    "        Build the index with the parent implementation, then record the\n",
    "        average length of the indexed text doc[3] in self.avg_dl.\n",
    "        \"\"\"\n",
    "        super().build_cache()\n",
    "        if self.docs:\n",
    "            self.avg_dl = sum(len(doc[3]) for doc in self.docs) / len(self.docs)\n",
    "        else:\n",
    "            # No documents: avoid dividing by zero.\n",
    "            self.avg_dl = 0\n",
    "\n",
    "    def score(self, item, query, k1 = 2, b = 0.75):\n",
    "        \"\"\"\n",
    "        BM25 score of one document against the query.\n",
    "\n",
    "        :param item: document record; the lowercased 'title body' text at\n",
    "            item[3] is used when present, otherwise it is rebuilt from\n",
    "            item[1] and item[2]\n",
    "        :param query: raw query string, lowercased and segmented with jieba.cut\n",
    "        :param k1: term-frequency saturation parameter\n",
    "        :param b: strength of the document-length normalisation\n",
    "        :return: sum over query words of tf * idf\n",
    "        \"\"\"\n",
    "        # Term counts and document length use the same text that build_cache()\n",
    "        # indexed and averaged, so dl / avg_dl compares like with like.\n",
    "        text = item[3] if len(item) > 3 else (item[1] + ' ' + item[2]).lower()\n",
    "        dl = len(text)\n",
    "        length_ratio = dl / self.avg_dl if self.avg_dl else 1\n",
    "        length_norm = k1 * (1 - b + b * length_ratio)\n",
    "        doc_count = len(self.docs)\n",
    "        score = 0\n",
    "        for keyword in jieba.cut(query.lower()):\n",
    "            f = text.count(keyword)\n",
    "            if f == 0:\n",
    "                continue  # tf is 0, the word contributes nothing\n",
    "            doc_freq = self.df.get(keyword, 0)\n",
    "            tf = (f * (k1 + 1)) / (f + length_norm)\n",
    "            # The 1 + ... form keeps idf non-negative; without it a word found\n",
    "            # in more than half of the documents would lower the score.\n",
    "            idf = log10(1 + (doc_count - doc_freq + 0.5) / (doc_freq + 0.5))\n",
    "            score += tf * idf\n",
    "        return score"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 1.58 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Instantiate the BM25 searcher. Its build_cache() also computes the average\n",
    "# document length; the cell magic above reports the construction time.\n",
    "searcher_v2 = MySearcherC11V2()"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1[6.638733073087041] <span style=\"color:red\";>华为</span>供应链公司：已向<span style=\"color:red\";>华为</span>P50<span style=\"color:red\";>手机</span>供货，供货时间有延后\n",
      "据<span style=\"color:red\";>华为</span><span style=\"color:red\";>手机</span>供应链公司，该公司已逐......，因为美国政府禁止其进口<span style=\"color:red\";>5G</span>机型的零部件。许多供应...\n",
      "2[6.514435315643027] 传<span style=\"color:red\";>华为</span>预计今年智能<span style=\"color:red\";>手机</span>出货量同比减少60%，降至7000万部\n",
      "...，据供应链消息人士透露，<span style=\"color:red\";>华为</span>已通知其供应商，预计......7000万至8000万部智能<span style=\"color:red\";>手机</span>的零部件。而且<span style=\"color:red\";>华为</span>的零部......，因为美国政府禁止其进口<span style=\"color:red\";>5G</span>机型的零部件。许多供应...\n",
      "3[6.154567506099941] 除了欢迎拜登致电<span style=\"color:red\";>华为</span>，任正非还谈了孟晚舟、退休时间、<span style=\"color:red\";>5G</span>转让等\n",
      "...任正非接受中外媒体采访，就<span style=\"color:red\";>华为</span>发展、产业、个人生活......发生了很多事，有关于<span style=\"color:red\";>华为</span><span style=\"color:red\";>手机</span>出货量的报道，也有关于<span style=\"color:red\";>华为</span>在欧洲拓展<span style=\"color:red\";>5G</span>业务不是很好的消息。一...\n",
      "4[5.98804822098848] <span style=\"color:red\";>华为</span>将在英国起诉汇丰，要拿到孟晚舟案关键文件\n",
      "...消息，当地时间12日，中国<span style=\"color:red\";>华为</span>公司首席财务官孟晚舟......发生了很多事，有关于<span style=\"color:red\";>华为</span><span style=\"color:red\";>手机</span>出货量的报道，也有关于<span style=\"color:red\";>华为</span>在欧洲拓展<span style=\"color:red\";>5G</span>业务不是很好的消息。一...\n",
      "5[5.8397968218966865] <span style=\"color:red\";>华为</span>推\"智慧养猪\"，任正非曾称如果养猪可能也是状元\n",
      "...生猪养殖业真香，连电信巨头<span style=\"color:red\";>华为</span>都想进去分一杯羹了。近......突破，任正非表示，<span style=\"color:red\";>华为</span>不靠<span style=\"color:red\";>手机</span>业务也可以存活。此前任正......发展论坛上，<span style=\"color:red\";>华为</span>就发表了《<span style=\"color:red\";>5G</span>引领现代猪场AI使能智...\n",
      "6[5.785736928141479] <span style=\"color:red\";>华为</span>推“智慧养猪”，任正非：<span style=\"color:red\";>华为</span>不靠<span style=\"color:red\";>手机</span>也能活\n",
      "...近日，任正非首次公开提及<span style=\"color:red\";>华为</span>“南泥湾”计划，即生产自......突破，任正非表示，<span style=\"color:red\";>华为</span>不靠<span style=\"color:red\";>手机</span>业务也可以存活。随后......发展论坛上，<span style=\"color:red\";>华为</span>就发表了《<span style=\"color:red\";>5G</span>引领现代AI使能智慧养...\n",
      "7[5.7716095866827875] <span style=\"color:red\";>华为</span>折叠屏<span style=\"color:red\";>手机</span><span style=\"color:red\";>华为</span>MateX2将于下周一20时发布\n",
      "财联社2月18日讯，据<span style=\"color:red\";>华为</span>官微，<span style=\"color:red\";>华为</span>折叠屏<span style=\"color:red\";>手机</span>2将于2月22日20......出售终端业务。我们可以转让<span style=\"color:red\";>5G</span>技术，但绝不会出售终...\n",
      "8[4.816802219940865] 争国内第一！荣耀想打败<span style=\"color:red\";>华为</span>小米，靠3599的V40行么？\n",
      "...月前，荣耀正式宣布独立。<span style=\"color:red\";>华为</span>官方称这是“产业链的一......问题。更重要的是，其他国产<span style=\"color:red\";>手机</span>品牌已经虎视眈眈，瞄......透露：“正在着手荣耀的2个<span style=\"color:red\";>5G</span>项目。”但是，媒体对...\n",
      "9[4.055546877571298] 小米国内机型不再支持自行安装GMS框架，国际版不受影响\n",
      "...注意到一些报道中提及小米<span style=\"color:red\";>手机</span>不支持GMS服务，事......年5月16日后，美国制裁<span style=\"color:red\";>华为</span>，谷歌的GMS不再为华......纪念版、Redmi 10X <span style=\"color:red\";>5G</span>等。1月31日晚间，小...\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Query the BM25 searcher and print the ranked results with highlighted summaries.\n",
    "print(searcher_v2.render_search_result('华为5G手机'))"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}